Set up packages, chart themes, slack notifications etc
#load the user data
user <- read_csv("../../data/user_event/mode_users.csv",
col_types = list(created_at = col_datetime(),
activated_at = col_datetime()) )
user <-tbl_df(user)
#load the event data
event <- read_csv("../../data/user_event/mode_user_events.csv",
col_types = list(occurred_at = col_datetime()))
event <- tbl_df(event)
#view the number of rows and cols
dim(user)
## [1] 18911 6
dim(event)
## [1] 285260 6
#glimpse the structure
glimpse(user)
## Observations: 18911
## Variables:
## $ user_id (dbl) 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...
## $ created_at (time) 2013-01-01 14:32:28, 2013-01-01 09:56:58, 2013-0...
## $ company_id (dbl) 5373, 1877, 6135, 12910, 8966, 792, 92, 5370, 907...
## $ language (chr) "french", "indian", "english", "english", "englis...
## $ activated_at (time) NA, NA, 2013-01-01 18:21:55, NA, 2013-01-01 05:3...
## $ state (chr) "pending", "pending", "active", "pending", "activ...
glimpse(event)
## Observations: 285260
## Variables:
## $ user_id (dbl) 8546, 8546, 8546, 8546, 8546, 8546, 8546, 8546, 85...
## $ occurred_at (time) 2014-05-02 13:21:16, 2014-05-02 13:21:52, 2014-05...
## $ event_type (chr) "engagement", "engagement", "engagement", "engagem...
## $ event_name (chr) "login", "like_message", "home_page", "like_messag...
## $ location (chr) "Indonesia", "Indonesia", "Indonesia", "Indonesia"...
## $ device (chr) "macbook pro", "macbook pro", "macbook pro", "macb...
#view the top few rows
datatable(head(user))
datatable(head(event))
#keep only required columns in user and event dataframes
user <-
user %>%
select(user_id,
activated_at,
state,
language)
event <-
event %>%
select(user_id,
event_name,
occurred_at,
device,
location)
#build the cohort dataframe
cohort_df <-
user %>%
#only keep users that were activated May 1 2014 onwards
filter(!is.na(activated_at) & activated_at >= '2014-05-01') %>%
#get the week starting date as cohort
#mutate(cohort = floor_date(activated_at, "week") + days(1) ) %>% #to make the week start on Monday
mutate(cohort = floor_date(activated_at, "week") ) %>%
#join user activity(events table)
inner_join(event, by = c("user_id" = "user_id")) %>%
#create column to store diff in weeks from activation date for each event/activity
mutate(period_age = floor(day( seconds_to_period(occurred_at - activated_at))/7) ) %>%
group_by(cohort) %>%
#for each cohort calculate the longest observation window
mutate(cutoff_age = max(period_age)) %>%
group_by(cohort, cutoff_age, period_age) %>%
#count the no. of users with activity in each observation time point
summarise(tally = n_distinct(user_id)) %>%
#store the starting no. of users in the cohort
mutate(first_period = max(tally)) %>%
#compute retention rate for each time point
mutate(retention = tally/first_period) %>%
ungroup() %>%
arrange(cohort, cutoff_age, period_age)
#change layout to wide view
cohort_visual_df <-
cohort_df %>%
select(cohort, first_period, period_age, retention) %>%
mutate(retention = round(retention, digits = 2)) %>%
spread(period_age, retention) %>%
arrange(cohort)
datatable(cohort_visual_df)
cohort_df %>%
filter(period_age >= 0) %>%
ggplot(aes(x = period_age, y = retention)) +
geom_line(alpha = 0.5, size = 0.75, aes(group = cohort, color = as.character(cohort))) +
scale_color_manual(values = tableau_color_pal("tableau20")(20)) +
scale_y_continuous(labels = percent, limits = c(0,1)) +
scale_x_discrete() +
labs(x="Weeks after signup ", y = "Retention rate") +
chart_theme_custom_base(fsize = 14, background_grey = 0, vgrid = 1, hgrid = 1, mhgrid = 1) +
legend_show(position = "right") +
guides(color = guide_legend(title = "Week Starting"))
#cols = colorRampPalette(rev(brewer.pal(11, "RdYlGn")), space="Lab")
cols = colorRampPalette((brewer.pal(7, "RdYlGn")), space="Lab")
cohort_df %>%
filter(period_age > 0) %>%
#mutate(cohort = reorder(cohort, first_period)) %>%
ggplot(aes(x = period_age, y = cohort )) +
geom_tile(color = "white", size = 0.5, alpha = 0.8, aes(fill = retention)) +
scale_fill_gradientn(colours = cols(5)) +
geom_text(color = "black", size = 3.5, aes(x = period_age, y = cohort, label = sprintf("%1.0f%%", 100*retention) )) +
geom_text(data = filter(cohort_df, period_age <1), color = "dark green", size = 4, fontface= "bold",
aes(x = period_age, y = cohort, label = tally, hjust = 1)) +
scale_y_datetime(breaks = date_breaks("1 week"), labels = date_format("%b - %d")) +
expand_limits(x = -1) + #to accomodate the week0 text labels
scale_x_discrete(limits = seq(1, 17, 1)) +
labs(x="Retention rate by weeks after signup\n ", y = "Signup week") +
chart_theme_custom_base(fsize = 14, vgrid = 0, hgrid = 0, font_family = "") +
theme(axis.text.y=element_text(vjust = 1)) +
legend_show(size_label_font = 10, position = "bottom") +
guides(fill = guide_colorbar( barwidth = 10, barheight = 0.5, title.vjust = 1, title = "Retention Rate" ))
#build the cohort dataframe
cohort_df <-
user %>%
#only keep users that were activated May 1 2014 onwards
filter(!is.na(activated_at) & activated_at >= '2014-05-01') %>%
#set the user language as cohort
mutate(cohort = language) %>%
#join user activity(events table)
inner_join(event, by = c("user_id" = "user_id")) %>%
#create column to store diff in weeks from activation date for each event/activity
mutate(period_age = floor(day( seconds_to_period(occurred_at - activated_at))/7) ) %>%
group_by(cohort) %>%
#for each cohort calculate the longest observation window
mutate(cutoff_age = max(period_age)) %>%
group_by(cohort, cutoff_age, period_age) %>%
#count the no. of users with activity in each observation time point
summarise(tally = n_distinct(user_id)) %>%
#store the starting no. of users in the cohort
mutate(first_period = max(tally)) %>%
#compute retention rate for each time point
mutate(retention = tally/first_period) %>%
ungroup() %>%
arrange(cohort, cutoff_age, period_age)
#change layout
cohort_visual_df <-
cohort_df %>%
select(cohort, first_period, period_age, retention) %>%
mutate(retention = round(retention, digits = 2)) %>%
spread(period_age, retention) %>%
arrange(cohort)
datatable(cohort_visual_df)
cohort_df %>%
filter(period_age >= 0) %>%
ggplot(aes(x = period_age, y = retention)) +
geom_line(alpha = 0.8, size = 0.5, aes(group = cohort, color = cohort)) +
scale_color_manual(values = tableau_color_pal("tableau20")(20)) +
scale_y_continuous(labels = percent, limits = c(0,1)) +
scale_x_discrete() +
labs(x="Weeks after signup ", y = "Retention rate") +
chart_theme_custom_base(fsize = 14, background_grey = 0, vgrid = 1, hgrid = 1) +
legend_show(position = "right", show_title = 0) +
legend_size(symbol_size = 1)
#cols = colorRampPalette(rev(brewer.pal(11, "RdYlGn")), space="Lab")
cols = colorRampPalette((brewer.pal(7, "RdYlGn")), space="Lab")
cohort_df %>%
filter(period_age > 0) %>%
mutate(cohort = reorder(cohort, first_period)) %>%
ggplot(aes(x = period_age, y = cohort )) +
geom_tile(color = "white", size = 0.5, alpha = 0.8, aes(fill = retention)) +
scale_fill_gradientn(colours = cols(5)) +
geom_text(color = "black", size = 3.5, aes(x = period_age, y = cohort, label = sprintf("%1.0f%%", 100*retention) )) +
geom_text(data = filter(cohort_df, period_age <1), color = "dark green", size = 4, fontface= "bold",
aes(x = period_age, y = cohort, label = tally, hjust = 1)) +
expand_limits(x = -1) + #to accomodate the week0 text labels
scale_x_discrete(limits = seq(1, 17, 1)) +
labs(x="Retention rate by weeks after signup ", y = "Language") +
chart_theme_custom_base(fsize = 14, background_grey = 0, vgrid = 0, hgrid = 0)
#see the different status codes. We are looking for signup completion
freq_dist(event, event_name) %>% print(n = 50)
## Source: local data frame [21 x 3]
##
## event_name n perc
## 1 home_page 76239 26.7%
## 2 like_message 48124 16.9%
## 3 view_inbox 45349 15.9%
## 4 login 33854 11.9%
## 5 send_message 26882 9.4%
## 6 search_autocomplete 15996 5.6%
## 7 search_run 11679 4.1%
## 8 create_user 7159 2.5%
## 9 enter_email 4303 1.5%
## 10 enter_info 3812 1.3%
## 11 complete_signup 3631 1.3%
## 12 search_click_result_2 1182 0.4%
## 13 search_click_result_1 1165 0.4%
## 14 search_click_result_4 1068 0.4%
## 15 search_click_result_3 1019 0.4%
## 16 search_click_result_5 855 0.3%
## 17 search_click_result_9 688 0.2%
## 18 search_click_result_6 682 0.2%
## 19 search_click_result_7 568 0.2%
## 20 search_click_result_8 562 0.2%
## 21 search_click_result_10 443 0.2%
#get list of users who completed signup
users_signed_up <-
event %>%
filter(event_name == "complete_signup") %>%
#set the user language as cohort
mutate(cohort = device) %>%
filter(!is.na(cohort)) %>%
select(user_id, cohort)
#build the cohort dataframe
cohort_df <-
user %>%
#only keep users that were activated May 1 2014 onwards
filter(!is.na(activated_at) & activated_at >= '2014-05-01') %>%
#join users who competed sign up
inner_join(users_signed_up, by = c("user_id" = "user_id")) %>%
#join user activity(events table)
inner_join(event, by = c("user_id" = "user_id")) %>%
#create column to store diff in weeks from activation date for each event/activity
mutate(period_age = floor(day( seconds_to_period(occurred_at - activated_at))/7) ) %>%
group_by(cohort) %>%
#for each cohort calculate the longest observation window
mutate(cutoff_age = max(period_age)) %>%
group_by(cohort, cutoff_age, period_age) %>%
#count the no. of users with activity in each observation time point
summarise(tally = n_distinct(user_id)) %>%
#store the starting no. of users in the cohort
mutate(first_period = max(tally)) %>%
#compute retention rate for each time point
mutate(retention = tally/first_period) %>%
ungroup() %>%
arrange(cohort, cutoff_age, period_age)
#change layout
cohort_visual_df <-
cohort_df %>%
select(cohort, first_period, period_age, retention) %>%
mutate(retention = round(retention, digits = 2)) %>%
spread(period_age, retention) %>%
arrange(desc(first_period))
datatable(cohort_visual_df)
cohort_df %>%
filter(period_age >= 0) %>%
ggplot(aes(x = period_age, y = retention)) +
geom_line(alpha = 0.8, size = 0.5, aes(group = cohort, colour = cohort)) +
scale_y_continuous(labels = percent, limits = c(0,1)) +
scale_x_discrete() +
labs(x="Weeks after signup ", y = "Retention rate") +
chart_theme_custom_base(fsize = 14, background_grey = 0, vgrid = 1, hgrid = 1, mhgrid = 1) +
legend_show(position = "right", show_title = 0) +
legend_size(symbol_size = 1)
#cols = colorRampPalette(rev(brewer.pal(11, "RdYlGn")), space="Lab")
cols = colorRampPalette((brewer.pal(7, "RdYlGn")), space="Lab")
blues = brewer.pal(9, "Blues")
cohort_df %>%
filter(period_age > 0) %>%
mutate(cohort = reorder(cohort, first_period)) %>%
ggplot(aes(x = period_age, y = cohort )) +
geom_tile(color = "white", size = 0.5, alpha = 0.8, aes(fill = retention)) +
scale_fill_gradientn(colours = cols(5)) +
geom_text(color = "black", size = 3, aes(x = period_age, y = cohort, label = sprintf("%1.0f%%", 100*retention) )) +
geom_text(data = filter(cohort_df, period_age <1), color = "dark green", size = 4, fontface= "bold",
aes(x = period_age, y = cohort, label = tally, hjust = 1)) +
expand_limits(x = -1) + #to accomodate the week0 text labels
scale_x_discrete(limits = seq(1, 17, 1)) +
labs(x="Retention rate by weeks after signup ", y = "Device used for signup") +
chart_theme_custom_base(fsize = 14, background_grey = 0, vgrid = 0, hgrid = 0)
#build the cohort dataframe
cohort_layer_df <-
user %>%
#only keep users that were activated May 1 2014 onwards
filter(!is.na(activated_at) & activated_at >= '2014-05-01') %>%
#get the week starting date as cohort
#mutate(cohort = floor_date(activated_at, "week") + days(1) ) %>% #to make the week start on Monday
mutate(cohort = floor_date(activated_at, "week") ) %>%
#join user activity(events table)
inner_join(event, by = c("user_id" = "user_id")) %>%
#create column to store diff in weeks from activation date for each event/activity
mutate(period_age = floor(day( seconds_to_period(occurred_at - activated_at))/7) ) %>%
mutate(week = floor_date(occurred_at, "week" )) %>%
#group_by(cohort) %>%
#for each cohort calculate the longest observation window
#mutate(cutoff_age = max(period_age)) %>%
#group_by(cohort, cutoff_age, period_age) %>%
group_by(cohort, week) %>%
#count the no. of users with activity in each observation time point
summarise(tally = n_distinct(user_id))
#change layout to wide view
cohort_layer_visual_df <-
cohort_layer_df %>%
spread(week, tally) %>%
arrange(cohort)
datatable(cohort_layer_visual_df)
#build the layered cohort chart
# setup the color palette
blues = colorRampPalette((brewer.pal(5, "Blues")), space="Lab")
cohort_layer_visual_df %>%
gather(week, tally, -cohort) %>%
mutate(tally=replace(tally, is.na(tally), 0)) %>%
mutate(week = ymd(week)) %>%
filter(week < "2014-08-30") %>%
mutate(cohort = as.factor(cohort)) %>%
ggplot(aes(x = week, y = tally, fill = cohort, group = cohort)) +
geom_area(color = "white", size = 0.01, aes(fill = cohort), alpha = 1) +
chart_theme_custom_base(mhgrid = 1, mvgrid = 1) +
legend_show(position = "right", show_title = 0, size_label_font = 10) +
scale_x_datetime() +
scale_fill_manual(values = blues(19)) +
guides(fill = guide_legend(reverse = TRUE)) +
labs(x="Weeks", y = "Number of users active in a week")
d1 <- ymd_hms("2014-06-01 13:35:52")
d1
## [1] "2014-06-01 13:35:52 UTC"
d2 <- ymd_hms("2014-06-03 13:35:52")
d2
## [1] "2014-06-03 13:35:52 UTC"
d1 - d2
## Time difference of -2 days
ratio <- .51
sprintf("%1.0f%%", 100*ratio)
## [1] "51%"
col_pal = brewer.pal(3, "Set2")
#display.brewer.all() #RdYIGn
# View 5 colors for the Set2 palette
display.brewer.pal(7,"RdYlGn")
col_pal = brewer.pal(7, "RdYlGn")
col_pal[1]
## [1] "#D73027"
display.brewer.pal(9,"Blues")
blues = brewer.pal(9, "Blues")
col_pal[1]
## [1] "#D73027"